#!/usr/bin/perl
#Name    	: 5_Final_validation.pl
#Author  	: Morgan, Matthew
#Created 	: 07/2010
#Modified	: 03/2012
#Purpose	: Collate validation information for each sequence and output according to validation status 
#Syntax		: 5_Final_validation.pl [script3 validated sequences output] [Column number containing first MID info]
#Further info	: Further information regarding this script and APDP can be found in the documentation downloaded with this file, and in Morgan et al., (in review)
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.

#########################################################################################################################################################	
#																			#
#CSIRO Open Source Software License Agreement (GPLv3)													#
#																			#
#Copyright (c) 2010, 2012 Commonwealth Scientific and Industrial Research Organisation (CSIRO) ABN 41 687 119 230.					#
#																			#
#All rights reserved. CSIRO is willing to grant you a license to APDP on the terms of the GNU General Public License version 3				#
# as published by the Free Software Foundation (http://www.gnu.org/licenses/gpl.html), except where otherwise indicated for third party material.	#
#The following additional terms apply under clause 7 of that license:											#
#																			#
#EXCEPT AS EXPRESSLY STATED IN THIS LICENCE AND TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO AND ITS		#
#CONTRIBUTORS MAKE NO REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS,	#
#WARRANTIES OR CONDITIONS REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,		#
#NON-INFRINGEMENT, THE ABSENCE OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE.				#
#																			#
#TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO OR ITS CONTRIBUTORS BE LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT		#
#LIMITATION, IN AN ACTION FOR BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR OTHER LIABILITY HOWSOEVER INCURRED.		#
#WITHOUT LIMITING THE SCOPE OF THE PREVIOUS SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR OPERATION TIME, LOSS,		#
#DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC LOSS;		#
#OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS LICENCE, THE USE		#
#OF THE SOFTWARE OR THE USE OF OR OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO OR ITS CONTRIBUTORS HAVE BEEN ADVISED OF THE POSSIBILITY OF		#
#SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.														#
#																			#
#APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY IMPLY REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS		#
#OR LIABILITY ON CSIRO OR ONE OF ITS CONTRIBUTORS IN RESPECT OF THE SOFTWARE THAT CANNOT BE WHOLLY OR PARTLY EXCLUDED, RESTRICTED OR			#
#MODIFIED "CONSUMER GUARANTEES".  IF SUCH CONSUMER GUARANTEES APPLY THEN THE LIABILITY OF CSIRO AND ITS CONTRIBUTORS IS LIMITED, TO THE FULL		#
#EXTENT PERMITTED BY THE APPLICABLE LEGISLATION.  WHERE THE APPLICABLE LEGISLATION PERMITS THE FOLLOWING REMEDIES TO BE PROVIDED FOR BREACH OF		#
#THE CONSUMER GUARANTEES THEN, AT ITS OPTION, CSIRO'S LIABILITY IS LIMITED TO ANY ONE OR MORE OF THEM:							#
#1.          THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN;					#
#2.          THE REPAIR OF THE SOFTWARE; 														#
#3.          THE PAYMENT OF THE COST OF REPLACING THE SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE RELEVANT SERVICES SUPPLIED AGAIN,		#
#	     OR HAVING THE SOFTWARE REPAIRED.														#
#																			#
#########################################################################################################################################################

use strict;
use warnings;

#Read in datafile containing replicate information

# Collect the run parameters.
#
# Command line (see the Syntax line in the file header):
#   $ARGV[0]  table of validated sequences produced by the previous script
#   $ARGV[1]  1-based column number of the first per-MID read count
#
# Interactive (STDIN):
#   - replicate information file; if left blank, the number of samples and
#     the minimum number of valid observations are asked for instead and a
#     one-experiment replicate file ("temp_rep_file") covering every sample
#     is generated
#   - minimum number of reads for a sequence to count as detected
#
# Sets the file-level variables $repfile and $detect used by later sections.

# Fail early with the usage line: the column argument is used for array
# indexing throughout, and the column before it must hold the read total.
die "Syntax: $0 [script3 validated sequences output] [Column number containing first MID info]\n"
    unless ( @ARGV >= 2 && $ARGV[1] =~ /^\d+$/ && $ARGV[1] >= 2 );

# Print a prompt and return the answer typed on STDIN without its line ending.
# Returns '' on end-of-file so callers can treat it like an empty answer.
sub _read_answer {
    my ($prompt) = @_;
    print STDOUT $prompt;
    my $answer = <STDIN>;
    return '' unless defined $answer;
    $answer =~ s/\r?\n\z//;
    return $answer;
}

my $repfile = _read_answer("\nReplicate information file : ");
if ( !$repfile ) {
    my $total = _read_answer("\nTotal number of samples : ");
    die "Need to know number of samples\n" unless ($total);

    my $minim = _read_answer("\nMinimum number of valid observations to retain sequence (default = 1) : ");
    $minim = 1 unless ($minim);

    # Same four-column layout as a user-supplied replicate file:
    # experiment name, number of samples, minimum valid observations, MID list.
    open( my $temp_fh, '>', 'temp_rep_file' )
        or die "Can't write temp_rep_file: $!\n";
    print {$temp_fh} "ALL\t$total\t$minim\t1-$total\n";
    close($temp_fh) or die "Can't finish writing temp_rep_file: $!\n";

    $repfile = "temp_rep_file";
}

my $detect = _read_answer("\nMinimum number of reads for detection (default = 2 - singletons will be excluded from validated_sequences_valid_reads file) : ");
$detect = 2 unless ($detect);

# Parse the replicate information file named by $repfile.
#
# Each non-blank line has four whitespace-separated fields:
#   experiment name, total number of MIDs, minimum number of valid
#   observations, and a comma-separated list of 1-based MID numbers and/or
#   inclusive ranges (e.g. "1-3,5").
#
# Results:
#   %samples   experiment name -> array ref of 0-based MID indices
#   %numvalid  experiment name -> minimum number of valid observations
#   $totalmids second field of the last line read
# %anysample and %record are declared here and filled in by later sections.
my ( %samples, %numvalid, $totalmids, %anysample, %record );

# "or" (not "||") so that a failed open really stops the script.
open( my $reps_fh, '<', $repfile )
    or die "Can't find replicates file '$repfile': $!\n";
while ( my $entry = <$reps_fh> ) {
    $entry =~ s/\r?\n\z//;
    next unless $entry =~ /\S/;    # ignore blank lines

    # split ' ' also ignores leading whitespace
    my ( $expt, $other, $minvalids, $mids ) = split( ' ', $entry );
    unless ( defined $mids ) {
        warn "Skipping malformed line $. of replicates file '$repfile'\n";
        next;
    }
    $totalmids = $other;

    # Expand the MID list, converting 1-based numbers to 0-based indices.
    my @indices;
    for my $item ( split( /,/, $mids ) ) {
        if ( $item =~ m/-/ ) {
            my ( $first, $last ) = split( /-/, $item );
            push @indices, ( $first - 1 ) .. ( $last - 1 );
        }
        else {
            push @indices, $item - 1;
        }
    }

    $numvalid{$expt} = $minvalids;
    $samples{$expt}  = \@indices;
}
close($reps_fh);

# Only clean up the replicate file this script generated itself.
if ( $repfile eq 'temp_rep_file' ) {
    unlink 'temp_rep_file' or warn "Can't remove temp_rep_file: $!\n";
}

#For each experiment, evaluate each sequence to assess how many samples it is valid or invalid in.  Use the criteria in the replicates file to assess each sequence as valid or invalid.

# Per-experiment validation.
#
# For every experiment in %samples:
#   1. read Validation_by_sample/validation.mid<N> for each of its MIDs and
#      count, per sequence, in how many samples it is VALID / INVALID;
#   2. write those counts to <exp>/Secondary_validation.txt and classify each
#      sequence of the input table ($ARGV[0]) as 'valid', 'invalid' or
#      'unknown' (results go into %anysample and %record for the combined
#      report written later);
#   3. write the per-experiment output files: valid.fna, invalid, unknown and
#      the two Final_validated_sequences_* tables.

# Index Indel_sequence_pairs.txt for one experiment.
#
# $mids is an array ref of the experiment's 0-based MID indices.  A line is
# used once for every entry of @$mids equal to the integer at the start of
# the line; whitespace-separated fields 1 and 3 of the line are the two
# sequence names of the pair.
# NOTE(review): assumes each line begins with the MID index followed by a
# non-digit separator - confirm against the script that writes this file.
# (The previous prefix test let MID 1 also pick up lines for MIDs 10-19.)
#
# Returns { excluded_by => { field1 => { field3 => n } },
#           excludes    => { field3 => { field1 => n } } }.
# Dies if the pairs file cannot be opened.
sub _load_indel_pairs {
    my ($mids) = @_;

    my %weight;
    $weight{$_}++ for @{$mids};

    my ( %excluded_by, %excludes );
    open( my $comp_fh, '<', 'Indel_sequence_pairs.txt' )
        or die "Can't open Indel_sequence_pairs.txt: $!\n";
    while ( my $line = <$comp_fh> ) {
        my ($line_mid) = $line =~ /^(-?\d+)/;
        next unless defined $line_mid && $weight{$line_mid};

        my @comp = split( /\s+/, $line );
        next unless defined $comp[3];

        $excluded_by{ $comp[1] }{ $comp[3] } += $weight{$line_mid};

        # A line pairing a sequence with itself only counts on one side.
        $excludes{ $comp[3] }{ $comp[1] } += $weight{$line_mid}
            unless $comp[1] eq $comp[3];
    }
    close($comp_fh);

    return { excluded_by => \%excluded_by, excludes => \%excludes };
}

# Complementary-indel test for one sequence.
#
# True when $name appears as field 1 of at least one pair and, for every
# partner it is paired with in that role, the partner is also paired with it
# in the opposite role and the two counts add up to $nmids.
sub _is_complementary_indel {
    my ( $pairs, $name, $nmids ) = @_;

    my $excluded_by = $pairs->{excluded_by}{$name} or return 0;
    my $excludes    = $pairs->{excludes}{$name} || {};

    for my $partner ( keys %{$excluded_by} ) {
        return 0 unless exists $excludes->{$partner};
        return 0 unless $excluded_by->{$partner} + $excludes->{$partner} == $nmids;
    }
    return 1;
}

# Sorted so that progress output is reproducible between runs.
for my $exp ( sort keys %samples ) {
    print "$exp\n";
    unless ( -d $exp ) {
        mkdir $exp or die "Can't create output directory '$exp': $!\n";
    }

    my @mids      = @{ $samples{$exp} };
    my $cutoff    = $numvalid{$exp};
    my $firstmid  = $ARGV[1] - 1;    # 0-based column of the first MID read count
    my $totalscol = $ARGV[1] - 2;    # 0-based column holding the read total
    print "$exp MIDS are : @mids\n";

    # --- 1. Read each sample's validation file once ------------------------
    # %valid / %invalid : sequence name -> number of samples with that status
    # %flagged          : MID -> sequence name -> 1 when the line for that
    #                     sequence mentions INVALID (its reads are zeroed below)
    my ( %valid, %invalid, %flagged );
    for my $mid (@mids) {
        print "$mid\n";
        my $mid_fh;
        unless ( open( $mid_fh, '<', "Validation_by_sample/validation.mid$mid" ) ) {
            warn("Can't open sample validation files for MID $mid\n");
            next;
        }
        while ( my $row = <$mid_fh> ) {
            $row =~ s/\r?\n//;
            my ( $name, $status ) = split( /\t/, $row );
            next unless defined $name;

            # Keyed by the exact name, so "seq1" no longer matches "seq10".
            $flagged{$mid}{$name} = 1 if $row =~ /INVALID/;

            next unless defined $status;
            if    ( $status eq 'INVALID' ) { $invalid{$name}++ }
            elsif ( $status eq 'VALID' )   { $valid{$name}++ }
        }
        close($mid_fh);
    }

    # --- 2. Record the counts and classify every sequence ------------------
    my %status;
    my $pairs;    # indel pair index, loaded only if some sequence needs it

    open( my $seqs_fh, '<', $ARGV[0] )
        or die "Can't open sequence file '$ARGV[0]': $!\n";
    open( my $sec_fh, '>', "$exp/Secondary_validation.txt" )
        or die "Can't write $exp/Secondary_validation.txt: $!\n";
    while ( my $row = <$seqs_fh> ) {
        next if $row =~ /^name/;    # header line
        $row =~ s/\r?\n//;

        my ($name) = split( /\t/, $row );
        $name = '' unless defined $name;

        my $nvalid   = $valid{$name}   || 0;
        my $ninvalid = $invalid{$name} || 0;
        print {$sec_fh} "$name\t$nvalid\t$ninvalid\n";

        $record{$exp}{$name} = 0;
        if ( $nvalid >= $cutoff ) {
            $status{$name} = 'valid';
        }
        elsif ( $nvalid == 0 && $ninvalid > 0 ) {
            $status{$name} = 'invalid';
        }
        else {
            # Valid in fewer than $cutoff samples: accept only if the
            # complementary-indel criteria are met.
            $pairs ||= _load_indel_pairs( \@mids );
            $status{$name} =
                _is_complementary_indel( $pairs, $name, scalar @mids )
                ? 'valid'
                : 'unknown';
        }

        if ( $status{$name} eq 'valid' ) {
            $anysample{$name}    = 1;
            $record{$exp}{$name} = 1;
        }
    }
    close($seqs_fh);
    close($sec_fh) or die "Can't finish writing $exp/Secondary_validation.txt: $!\n";

    # --- 3. Write the per-experiment output files --------------------------
    # MID columns reported for this experiment: members of @mids below
    # $totalmids, in ascending order.  Used for the header AND the data rows
    # so the two always line up.
    my %is_member   = map { $_ => 1 } @mids;
    my @report_mids = grep { $is_member{$_} } 0 .. $totalmids - 1;

    my %out_path = (
        fna      => "$exp/valid.fna",
        invalid  => "$exp/invalid",
        unknown  => "$exp/unknown",
        removed  => "$exp/Final_validated_sequences_invalid_reads_removed.txt",
        retained => "$exp/Final_validated_sequences_invalid_reads_retained.txt",
    );
    my %out;
    for my $key ( sort keys %out_path ) {
        open( $out{$key}, '>', $out_path{$key} )
            or die "Can't write $out_path{$key}: $!\n";
    }

    open( my $in_fh, '<', $ARGV[0] )
        or die "Can't open sequence file '$ARGV[0]': $!\n";
    while ( my $row = <$in_fh> ) {
        $row =~ s/\r?\n//;
        my @tmp     = split( /\t/, $row );
        my @gendata = @tmp[ 0 .. $totalscol ];    # general columns, read total last

        if ( $row =~ /^name/ ) {
            my $geninfo     = join( "\t", @gendata );
            my $midnamedata = join( "\t", map { $tmp[ $_ + $firstmid ] } @report_mids );
            for my $key (qw(invalid unknown removed retained)) {
                print { $out{$key} } "$geninfo\t$midnamedata\n";
            }
            next;
        }

        my $seqname = defined $tmp[0] ? $tmp[0] : '';
        my $state   = $status{$seqname};

        # Invalid and unknown sequences are copied unchanged to their file.
        if ( $state ne 'valid' ) {
            print { $out{$state} } "$row\n";
            next;
        }

        # Column 4 is written as the sequence text of the FASTA record.
        print { $out{fna} } ">$tmp[0]\n$tmp[4]\n";

        my ( $sum, $sum2 ) = ( 0, 0 );
        my ( @counts, @counts2 );
        for my $mid (@report_mids) {
            my $raw = $tmp[ $mid + $firstmid ];

            # "Removed" counts drop reads from samples where the sequence is
            # flagged INVALID and counts below the detection threshold.
            my $kept = $flagged{$mid}{$seqname} ? 0 : $raw;
            $kept = 0 if $kept < $detect;

            $sum  += $kept;
            $sum2 += $raw;
            push @counts,  $kept;
            push @counts2, $raw;
        }

        # The read-total column is replaced by this experiment's own total.
        unless ( $sum == 0 ) {
            $gendata[-1] = $sum;
            print { $out{removed} } join( "\t", @gendata ) . "\t" . join( "\t", @counts ) . "\n";
        }
        unless ( $sum2 == 0 ) {
            $gendata[-1] = $sum2;
            print { $out{retained} } join( "\t", @gendata ) . "\t" . join( "\t", @counts2 ) . "\n";
        }
    }
    close($in_fh);

    for my $key ( sort keys %out ) {
        close( $out{$key} ) or die "Can't finish writing $out_path{$key}: $!\n";
    }
}

#now reopen Preliminary_validated_sequences.txt.
#for each sequence, check if valid in any individual sample.
#Then run original confidence calculation...

# Combined report over all experiments, written to ALL_SAMPLES/.
#
# Re-reads the input table ($ARGV[0]).  A sequence that was valid in at least
# one experiment (%anysample) is written to both Final_validated_sequences_*
# tables with column 1 replaced by a confidence value; every other sequence is
# copied unchanged to Invalid_or_ambiguous_sequences.txt.
#
# confidence = (reads at or above $detect in the MIDs of experiments where the
#               sequence is valid) / (value of the read-total column)
#
# In the "invalid_reads_removed" table the read-total column is replaced by
# that numerator and the MID columns of experiments where the sequence is not
# valid are set to 0; the "invalid_reads_retained" table keeps the original
# counts.
{
    my $firstmid  = $ARGV[1] - 1;    # 0-based column of the first MID read count
    my $totalscol = $ARGV[1] - 2;    # 0-based column holding the read total
    my $outdir    = 'ALL_SAMPLES';

    unless ( -d $outdir ) {
        mkdir $outdir or die "Can't create output directory '$outdir': $!\n";
    }

    my $removed_path  = "$outdir/Final_validated_sequences_All_Samples_invalid_reads_removed.txt";
    my $rejected_path = "$outdir/Invalid_or_ambiguous_sequences.txt";
    my $retained_path = "$outdir/Final_validated_sequences_All_Samples_invalid_reads_retained.txt";

    open( my $table_fh, '<', $ARGV[0] )
        or die "Can't open sequence file '$ARGV[0]': $!\n";
    open( my $removed_fh, '>', $removed_path )
        or die "Can't write $removed_path: $!\n";
    open( my $rejected_fh, '>', $rejected_path )
        or die "Can't write $rejected_path: $!\n";
    open( my $retained_fh, '>', $retained_path )
        or die "Can't write $retained_path: $!\n";

    while ( my $row = <$table_fh> ) {
        $row =~ s/\r?\n//;
        my @tmp  = split( /\t/, $row );
        my $name = defined $tmp[0] ? $tmp[0] : '';

        if ( $row =~ /^name/ ) {
            # Header: the validated tables label column 1 "confidence".
            $tmp[1] = 'confidence';
            my $header = join( "\t", @tmp );
            print {$removed_fh}  "$header\n";
            print {$rejected_fh} "$row\n";
            print {$retained_fh} "$header\n";
            next;
        }

        # Not valid in any experiment.
        unless ( exists $anysample{$name} ) {
            print {$rejected_fh} "$row\n";
            next;
        }

        my @tmp2   = @tmp;    # untouched copy for the "retained" table
        my $consum = 0;

        # Sorted so the result does not depend on hash order when two
        # experiments share a MID.
        for my $exp ( sort keys %samples ) {
            my @mids = @{ $samples{$exp} };
            if ( $record{$exp}{$name} == 0 ) {
                # Sequence is not valid in this experiment: drop its reads.
                $tmp[ $_ + $firstmid ] = 0 for @mids;
            }
            else {
                for my $mid (@mids) {
                    my $reads = $tmp[ $mid + $firstmid ];
                    $consum += $reads if $reads >= $detect;
                }
            }
        }

        # A zero or missing total would otherwise abort the whole run with
        # "Illegal division by zero".
        my $total      = $tmp[$totalscol];
        my $confidence = $total ? $consum / $total : 0;

        $tmp[1]          = sprintf( "%.3f", $confidence );
        $tmp2[1]         = sprintf( "%.3f", $confidence );
        $tmp[$totalscol] = $consum;

        print {$removed_fh}  join( "\t", @tmp ),  "\n";
        print {$retained_fh} join( "\t", @tmp2 ), "\n";
    }
    close($table_fh);

    close($removed_fh)  or die "Can't finish writing $removed_path: $!\n";
    close($rejected_fh) or die "Can't finish writing $rejected_path: $!\n";
    close($retained_fh) or die "Can't finish writing $retained_path: $!\n";
}

# Report the elapsed wall-clock time ($^T is the time the script started)
# and signal completion.
my $took = time() - $^T;
printf( "Took %s seconds to complete\n%s\n", $took, 'DONE' );
